Paso 3

Análisis de clases latentes: modelo seleccionado sin predictores, caracterización de clases y medidas de ajuste

Andrés González Santa Cruz
April 23, 2023
Show code
script src = "https://ajax.googleapis.com/ajax/libs/jquery/3.4.1/jquery.min.js"
Show code
 $(document).ready(function() {
    $('body').prepend('<div class=\"zoomDiv\"><img src=\"\" class=\"zoomImg\"></div>');
    // onClick function for all plots (img's)
    $('img:not(.zoomImg)').click(function() {
      $('.zoomImg').attr('src', $(this).attr('src')).css({width: '100%'});
      $('.zoomDiv').css({opacity: '1', width: 'auto', border: '1px solid white', borderRadius: '5px', position: 'fixed', top: '50%', left: '50%', marginRight: '-50%', transform: 'translate(-50%, -50%)', boxShadow: '0px 0px 50px #888888', zIndex: '50', overflow: 'auto', maxHeight: '100%'});
    });
    // onClick function for zoomImg
    $('img.zoomImg').click(function() {
      $('.zoomDiv').css({opacity: '0', width: '0%'}); 
    });
  });
  
Show code
<script src="hideOutput.js"></script> 
Show code
$(document).ready(function() {    
    $chunks = $('.fold');    
    $chunks.each(function () {      // add button to source code chunks     
    if ( $(this).hasClass('s') ) {       
        $('pre.r', this).prepend("<div class=\"showopt\">Show Source</div><br style=\"line-height:22px;\"/>");
            $('pre.r', this).children('code').attr('class', 'folded');     
            }      // add button to output chunks     
        if ( $(this).hasClass('o') ) {       
            $('pre:not(.r)', this).has('code').prepend("<div class=\"showopt\">Show Output</div><br style=\"line-height:22px;\"/>");       
            $('pre:not(.r)', this).children('code:not(r)').addClass('folded');        // add button to plots       
            $(this).find('img').wrap('<pre class=\"plot\"></pre>');       
            $('pre.plot', this).prepend("<div class=\"showopt\">Show Plot</div><br style=\"line-height:22px;\"/>");       
            $('pre.plot', this).children('img').addClass('folded');      
            }   
});    // hide all chunks when document is loaded   
    $('.folded').css('display', 'none')    // function to toggle the visibility   
    $('.showopt').click(function() {     
            var label = $(this).html();     
            if (label.indexOf("Show") >= 0) {       
                $(this).html(label.replace("Show", "Hide"));     
            } else {
              $(this).html(label.replace("Hide", "Show"));     
            }     
    $(this).siblings('code, img').slideToggle('fast', 'swing');   
    }); 
}); 

Cargamos los datos

Show code
rm(list = ls());gc()
         used (Mb) gc trigger (Mb) max used (Mb)
Ncells 535876 28.7    1209218 64.6   643711 34.4
Vcells 908881  7.0    8388608 64.0  1649632 12.6
Show code
load("data2_lca2_2023_04_21.RData")

Cargamos los paquetes

Show code
knitr::opts_chunk$set(echo = TRUE)

if(!require(poLCA)){install.packages("poLCA")}
if(!require(poLCAParallel)){devtools::install_github("QMUL/poLCAParallel@package")}
if(!require(compareGroups)){install.packages("compareGroups")}
if(!require(parallel)){install.packages("parallel")}
if(!require(Hmisc)){install.packages("Hmisc")}
if(!require(tidyverse)){install.packages("tidyverse")}
try(if(!require(sjPlot)){install.packages("sjPlot")})
if(!require(emmeans)){install.packages("emmeans")}
if(!require(nnet)){install.packages("nnet")}
if(!require(here)){install.packages("here")}
if(!require(doParallel)){install.packages("doParallel")}
if(!require(progress)){install.packages("progress")}
if(!require(caret)){install.packages("caret")}
if(!require(rpart)){install.packages("rpart")}
if(!require(rpart.plot)){install.packages("rpart.plot")}
if(!require(partykit)){install.packages("partykit")}
if(!require(randomForest)){install.packages("randomForest")}
if(!require(ggcorrplot)){install.packages("ggcorrplot")}
if(!require(polycor)){install.packages("polycor")}
if(!require(tableone)){install.packages("tableone")}
if(!require(broom)){install.packages("broom")}
if(!require(plotly)){install.packages("plotly")}

#if(!require(poLCA)){githubinstall::gh_install_packages("poLCA", ref = github_pull("14"))}

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
lca_dir<-here::here()
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#
#  
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

tryNA <- function(x){
    x <- try(x)
    if(inherits(x,'try-error')) return(NA)
    x
}

#https://rdrr.io/github/hyunsooseol/snowRMM/src/R/lca.b.R
#https://github.com/dlinzer/poLCA/issues/7

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#' Bivariate residuals for latent class models
#' 
#' Calculate the "bivariate residuals" (BVRs) between pairs of variables 
#' in a latent class model.
#' 
#' This function compares the model-implied (expected) counts in the crosstables
#' of all pairs of observed dependent variables to the observed counts. For each
#' pair, it calculates a "chi-square" statistic,
#' 
#' \deqn{\text{BVR} = \sum_{j, j'} \frac{(n_{jj'} - e_{jj'})^2}{e_{jj'}}},
#' 
#' where \eqn{n_{jj'}} are the observed counts for categories \eqn{j} and \eqn{j'} 
#' of the variables being crosstabulated, and \eqn{e_{jj'}} are
#' the expected counts under the latent class model. 
#' 
#' Note that the BVR does not follow an asymptotic chi-square distribution and
#' for accurate p-values, parametric bootstrapping is necessary (Oberski et al. 2013).
#' 
#' @param fit A poLCA fit object
#' @param tol Optional: tolerance for small expected counts
#' @param rescale_to_df Optional: whether to divide the pairwise "chi-square" values by 
#' the degrees of freedom of the local crosstable. Default is TRUE.
#' @return The table of bivariate residuals
#' @author Daniel Oberski (daniel.oberski@gmail.com)
#' @seealso \code{\link{poLCA}} for fitting the latent class model.
#' @references 
#' Oberski, DL, Van Kollenburg, GH and Vermunt, JK (2013). 
#'   A Monte Carlo evaluation of three methods to detect local dependence in binary data latent class models. 
#'   Advances in Data Analysis and Classification 7 (3), 267-279.
#' @examples
#' data(values)
#' f <- cbind(A, B, C, D) ~ 1
#' M0 <- poLCA(f,values, nclass=1, verbose = FALSE) 
#' bvr(M0) # 12.4, 5.7, 8.3, 15.6, ... 
bvr <- function(fit, tol = 1e-3, rescale_to_df = TRUE) {
  stopifnot(class(fit) == "poLCA")

  ov_names <- names(fit$predcell)[1:(ncol(fit$predcell) - 2)]
  ov_combn <- combn(ov_names, 2)

  get_bvr <- function(ov_pair) {
    form_obs <- as.formula(paste0("observed ~ ", ov_pair[1], " + ", ov_pair[2]))
    form_exp <- as.formula(paste0("expected ~ ", ov_pair[1], " + ", ov_pair[2]))

    counts_obs <- xtabs(form_obs, data = fit$predcell)
    counts_exp <- xtabs(form_exp, data = fit$predcell)
    counts_exp <- ifelse(counts_exp < tol, tol, counts_exp) # Prevent Inf/NaN

    bvr_df <- prod(dim(counts_exp) - 1)
    bvr_value <- sum((counts_obs - counts_exp)^2 / counts_exp)

    if(rescale_to_df) bvr_value <- bvr_value / bvr_df

    attr(bvr_value, "df") <- bvr_df

    bvr_value
  }

  bvr_pairs <- apply(ov_combn, 2, get_bvr)

  attr(bvr_pairs, "rescale_to_df") <- rescale_to_df
  attr(bvr_pairs, "class") <- "dist"
  attr(bvr_pairs, "Size") <- length(ov_names)
  attr(bvr_pairs, "Labels") <- ov_names
  attr(bvr_pairs, "Diag") <- FALSE
  attr(bvr_pairs, "Upper") <- FALSE

  bvr_pairs
}

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
poLCA.entropy.fix <- function (lc)
{
  K.j <- sapply(lc$probs, ncol)
  fullcell <- expand.grid(sapply(K.j, seq, from = 1))
  P.c <- poLCA.predcell(lc, fullcell)
  return(-sum(P.c * log(P.c), na.rm = TRUE))
}

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#Calculate entropy R2 for poLCA model

# MIT license
# Author: Daniel Oberski
# Input: result of a poLCA model fit
# Output: entropy R^2 statistic (Vermunt & Magidson, 2013, p. 71)
# See: daob.nl/wp-content/uploads/2015/07/ESRA-course-slides.pdf
# And: https://www.statisticalinnovations.com/wp-content/uploads/LGtecnical.pdf
machine_tolerance <- sqrt(.Machine$double.eps)
entropy.R2 <- function(fit) {
  entropy <- function(p) {
    p <- p[p > machine_tolerance] # since Lim_{p->0} p log(p) = 0
    sum(-p * log(p))
  }
  error_prior <- entropy(fit$P) # Class proportions
  error_post <- mean(apply(fit$posterior, 1, entropy))
  R2_entropy <- (error_prior - error_post) / error_prior
  R2_entropy
}

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#http://researchdata.gla.ac.uk/879/1/Survey_data_processed_using_R.pdf
##Function to plot variable probabilites by latent class

## Function to undertake chisquare analayis and plot graphs of residuals and contributions
chisquaretest.predictions.function <-
 function(indfactor.data,
   predclass.data,
   noclasses,
   pitem,
   gitem,
   chirows,
   chicols) {
   chisquare.results <- chisq.test(indfactor.data, predclass.data)
   residuals.data <- chisquare.results$residuals
   colnames(residuals.data) <- chicols
   rownames(residuals.data) <- chirows
     title.text <-
       paste(
       "Residuals: chi Square Crosstabulation of\n",
       pitem,
       "and",
       gitem,
       "\n(Chisquare =",
       round(chisquare.results$statistic, 3),
       " p <",
       round(chisquare.results$p.value, 3),
       ")",
       sep = " "
       )
     corrplot(
       residuals.data,
       is.cor = FALSE,
       title = title.text,
       mar = c(0, 0, 4, 0)
       )
     contrib.data <-
     100 * residuals.data ^ 2 / chisquare.results$statistic
     round(contrib.data, 3)
     colnames(contrib.data) <- chicols
     rownames(contrib.data) <- chirows
     title.text <-
     paste(
       "Contributions: chi Square Crosstabulation of\n",
       pitem,
       "and",
       gitem,
       "\n(Chisquare =",
       round(chisquare.results$statistic, 3),
       " p <",
       round(chisquare.results$p.value, 3),
       ")",
       sep = " "
       )
     corrplot(
       contrib.data,
       is.cor = FALSE,
       title = title.text,
       mar = c(0, 0, 4, 0)
       )
     return(chisquare.results)
 }
##Funciton for Cramers V test
cv.test = function(x, y) {
   CV = sqrt(chisq.test(x, y, correct = FALSE)$statistic /
   (length(x) * (min(
   length(unique(x)), length(unique(y))
   ) - 1)))
   print.noquote("Cramér V / Phi:"))
   return(as.numeric(CV))
  }

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

if(.Platform$OS.type == "windows") withAutoprint({
  memory.size()
  memory.size(TRUE)
  memory.limit(size=56000)
})

path<-try(dirname(rstudioapi::getSourceEditorContext()$path))

options(knitr.kable.NA = '')


#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

knitr::knit_hooks$set(time_it = local({
  now <- NULL
  function(before, options) {
    if (before) {
      # record the current time before each chunk
      now <<- Sys.time()
    } else {
      # calculate the time difference after a chunk
      res <- ifelse(difftime(Sys.time(), now)>(60^2),difftime(Sys.time(), now)/(60^2),difftime(Sys.time(), now)/(60^1))
      # return a character string to show the time
      x<-ifelse(difftime(Sys.time(), now)>(60^2),paste("Time for this code chunk to run:", round(res,1), "hours"),paste("Time for this code chunk to run:", round(res,1), "minutes"))
      paste('<div class="message">', gsub('##', '\n', x),'</div>', sep = '\n')
    }
  }
}))
knitr::opts_chunk$set(time_it = TRUE)
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

#to format rows in bold
format_cells <- function(df, rows ,cols, value = c("italics", "bold", "strikethrough")){

  # select the correct markup
  # one * for italics, two ** for bold
  map <- setNames(c("*", "**", "~~"), c("italics", "bold", "strikethrough"))
  markup <- map[value]  

  for (r in rows){
    for(c in cols){

      # Make sure values are not factors
      df[[c]] <- as.character( df[[c]])

      # Update formatting
      df[r, c] <- ifelse(nchar(df[r, c])==0,"",paste0(markup, gsub(" ", "", df[r, c]), markup))
    }
  }

  return(df)
}
#To produce line breaks in messages and warnings
knitr::knit_hooks$set(
   error = function(x, options) {
     paste('\n\n<div class="alert alert-danger">',
           gsub('##', '\n', gsub('^##\ Error', '**Error**', x)),
           '</div>', sep = '\n')
   },
   warning = function(x, options) {
     paste('\n\n<div class="alert alert-warning">',
           gsub('##', '\n', gsub('^##\ Warning:', '**Warning**', x)),
           '</div>', sep = '\n')
   },
   message = function(x, options) {
     paste('<div class="message">',
           gsub('##', '\n', x),
           '</div>', sep = '\n')
   }
)
#
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

as.data.frame.TableOne <- function(x, ...) {capture.output(print(x,
                          showAllLevels = TRUE, ...) -> x)
  y <- as.data.frame(x)
  y$charactersitic <- dplyr::na_if(rownames(x), "")
  y <- y %>%
  fill(charactersitic, .direction = "down") %>%
  select(charactersitic, everything())
  rownames(y) <- NULL
  y}

Definimos ciertas constantes

Show code
clus_iter= 1e3#500 #500
n_thread <- parallel::detectCores()
nrep <- clus_iter # number of different initial values (could be n_thread too)
n_class_max <- 12 # maximum number of classes to investigate
n_bootstrap <- 100 #30 # 50 number of bootstrap samples
print(n_thread)
[1] 8

Time for this code chunk to run: 0 minutes

Análisis de clases latentes

Seleccionar modelo final

Show code
#Si probs.start se establece en NULL (predeterminado) al llamar Polca, a continuación, la función genera los valores de partida al azar en cada ejecución. Esto significa que repite carreras de Polca normalmente producirán resultados con los mismos parámetros estimados (correspondiente a la misma el máximo diario de probabilidad), pero con etiquetas de clase latentes reordenados

#https://drive.google.com/file/d/10njMh5JEcqaBgCnoZdJ1uk3uCEkocDez/view?usp=share_link
#http://daob.nl/wp-content/uploads/2015/07/ESRA-course-slides.pdf
#https://docs.google.com/document/d/1LVeDpAP6CfT3n8B6HhHcc_SRnzO0JBoT/edit

#bvr(ppio_newList$lc_entropy_best_model)

#A list of matrices of class-conditional response probabilities to be used as the starting values for the estimation algorithm. Each matrix in the list corresponds to one manifest variable, with one row for each latent class, and one column for each outcome. The default is NULL, producing random starting values. Note that if nrep>1, then any user-specified probs.start values are only used in the first of the nrep attempts.

#The poLCA.reorder function takes as its first argument the list of starting values probs.start, and as its second argument a vector describing the desired reordering of the latent classes.
new.probs.start <-  poLCA.reorder(LCA_best_model_ppio$probs.start, order(LCA_best_model_ppio$P, decreasing = TRUE))
#new.probs.start <-poLCA.reorder(probs.start,c(4,1,3,2))
#A continuación, ejecute PoLCA, una vez más, esta vez utilizando los valores iniciales reordenados en la llamada de función.

#The argument nrep=5 tells the program to repeat nrep times, and keep the fit with the highest likelihood to try to avoid local maxima.

#.maxiter – The maximum number of iterations through which the estimation algorithm will cycle.
#.nrep - Number of times to estimate the model, using different values of probs.start. (default is one)
set.seed(2125)
LCA_best_model_mod<-
   poLCA(f_preds, mydata_preds, nclass=length(LCA_best_model_ppio$P), 
         maxiter=10000,tol=1e-5, na.rm=FALSE,nrep=1e3, verbose=TRUE, calc.se=TRUE,probs.start=new.probs.start) 
Conditional item response (column) probabilities,
 by outcome variable, for each class (row) 
 
$CAUSAL
          Pr(1)  Pr(2)  Pr(3)  Pr(4)
class 1:      0 0.1839 0.0000 0.8161
class 2:      0 0.0586 0.9414 0.0000
class 3:      0 0.9407 0.0593 0.0000
class 4:      0 0.0030 0.0000 0.9970
class 5:      0 0.4188 0.5812 0.0000
class 6:      0 0.1001 0.8999 0.0000

$EDAD_MUJER_REC
           Pr(1)  Pr(2)  Pr(3)  Pr(4)  Pr(5)  Pr(6)
class 1:  0.0075 0.3214 0.2603 0.2450 0.1428 0.0230
class 2:  0.0000 0.0030 0.0907 0.2900 0.5387 0.0777
class 3:  0.0000 0.0161 0.2547 0.4024 0.2910 0.0358
class 4:  0.0000 0.3195 0.3184 0.2172 0.1225 0.0225
class 5:  0.0313 0.0305 0.2559 0.3474 0.2765 0.0584
class 6:  0.0000 0.0350 0.2840 0.3126 0.3024 0.0660

$PUEBLO_ORIGINARIO_REC
           Pr(1)  Pr(2)  Pr(3)
class 1:  0.9821 0.0000 0.0179
class 2:  0.0000 1.0000 0.0000
class 3:  0.0000 0.9521 0.0479
class 4:  0.0000 0.9524 0.0476
class 5:  0.9227 0.0754 0.0019
class 6:  0.0000 0.9344 0.0656

$PAIS_ORIGEN_REC
           Pr(1)  Pr(2)  Pr(3)
class 1:  0.0000 0.8216 0.1784
class 2:  0.0000 0.9246 0.0754
class 3:  0.0000 0.7748 0.2252
class 4:  0.0000 0.7746 0.2254
class 5:  0.0331 0.8272 0.1397
class 6:  0.0000 0.8230 0.1770

$HITO1_EDAD_GEST_SEM_REC
           Pr(1)  Pr(2)  Pr(3)  Pr(4)  Pr(5)  Pr(6)
class 1:  0.0000 0.7885 0.2115 0.0000 0.0000 0.0000
class 2:  0.0078 0.0034 0.6057 0.2663 0.0986 0.0181
class 3:  0.0674 0.1673 0.2552 0.4953 0.0077 0.0071
class 4:  0.0129 0.7872 0.1999 0.0000 0.0000 0.0000
class 5:  0.0300 0.0000 0.2823 0.3049 0.2342 0.1485
class 6:  0.0036 0.0000 0.3205 0.3517 0.2387 0.0855

$MACROZONA
           Pr(1)  Pr(2)  Pr(3)  Pr(4)  Pr(5)  Pr(6)
class 1:  0.0298 0.4449 0.2139 0.0909 0.0825 0.1380
class 2:  0.0000 0.6751 0.1150 0.0658 0.0648 0.0794
class 3:  0.0000 0.4340 0.1539 0.1482 0.1261 0.1378
class 4:  0.0000 0.4225 0.1429 0.1238 0.1371 0.1736
class 5:  0.0129 0.3772 0.2023 0.1510 0.0959 0.1606
class 6:  0.0000 0.3599 0.1654 0.1852 0.1204 0.1692

$AÑO
          Pr(1)  Pr(2)  Pr(3)  Pr(4)  Pr(5)  Pr(6)
class 1:      0 1.0000 0.0000 0.0000 0.0000 0.0000
class 2:      0 0.0250 0.2684 0.2233 0.2623 0.2210
class 3:      0 0.0087 0.2978 0.1657 0.2618 0.2661
class 4:      0 0.0235 0.2200 0.2476 0.2090 0.2999
class 5:      0 1.0000 0.0000 0.0000 0.0000 0.0000
class 6:      0 0.0180 0.2599 0.2225 0.2916 0.2081

$PREV_TRAMO_REC
           Pr(1)  Pr(2)  Pr(3)  Pr(4)  Pr(5)
class 1:  0.0075 0.0679 0.6379 0.2193 0.0675
class 2:  0.0000 0.7924 0.0000 0.2076 0.0000
class 3:  0.0012 0.0870 0.5552 0.3294 0.0272
class 4:  0.0065 0.0515 0.6818 0.1976 0.0627
class 5:  0.0129 0.1175 0.5569 0.2800 0.0327
class 6:  0.0007 0.0000 0.6490 0.3416 0.0086

Estimated class population shares 
 0.0354 0.104 0.2148 0.1645 0.1434 0.3379 
 
Predicted class memberships (by modal posterior prob.) 
 0.0351 0.0876 0.2286 0.1639 0.1433 0.3415 
 
========================================================= 
Fit for 6 latent classes: 
========================================================= 
number of observations: 3789 
number of estimated parameters: 191 
residual degrees of freedom: 3598 
maximum log-likelihood: -31793.6 
 
AIC(6): 63969.2
BIC(6): 65161.01
G^2(6): 7425.912 (Likelihood ratio/deviance statistic) 
X^2(6): 138206.3 (Chi-square goodness of fit) 
 
Show code
output_LCA_best_model_mod<-capture.output(LCA_best_model_mod)
glance_LCA_best_model_mod<-glance(LCA_best_model_mod)
mydata_preds_LCA1 <- augment(LCA_best_model_mod, data = mydata_preds3)

Time for this code chunk to run: 0.7 minutes

Show code
# fig.height=15, 
## If you are interested in the population-shares of the classes, you can get them like this:
warning(paste("Probabilidades posteriores: ",
  paste(round(colMeans(LCA_best_model_mod$posterior)*100,2), collapse=", ")
  ))
## or you inspect the estimated class memberships:
warning(paste("Probabildiades predichas: ",
  paste(round(prop.table(table(LCA_best_model_mod$predclass)),4)*100, collapse=", ")
  ))

traductor_cats <- readxl::read_excel("tabla12.xlsx") %>% 
  dplyr::mutate(level=readr::parse_double(level)) %>% 
  dplyr::mutate(charactersitic=gsub(" \\(%\\)", "", charactersitic))



lcmodel <- reshape2::melt(LCA_best_model_mod$probs, level=2)
lcmodel<- lcmodel %>% 
  dplyr::mutate(pr=as.numeric(gsub("[^0-9.]+", "", Var2))) %>% 
  dplyr::left_join(traductor_cats[,c("charactersitic", "level", "CATEGORIA")], by= c("L2"="charactersitic", "pr"="level"))

lcmodel$text_label<-paste0("Categoria:",lcmodel$CATEGORIA,"<br>%: ",scales::percent(lcmodel$value))


zp1 <- ggplot(lcmodel,aes(x = L2, y = value, fill = Var2, label=text_label))
zp1 <- zp1 + geom_bar(stat = "identity", position = "stack")
zp1 <- zp1 + facet_grid(Var1 ~ .) 
zp1 <- zp1 + scale_fill_brewer(type="seq", palette="Greys", na.value = "white") +theme_bw()
zp1 <- zp1 + labs(y = "Porcentaje de probabilidad de respuesta", 
                  x = "",
                  fill ="Cateorías de\nRespuesta")
zp1 <- zp1 + theme( axis.text.y=element_blank(),
                    axis.ticks.y=element_blank(),                    
                    panel.grid.major.y=element_blank())
zp1 <- zp1 + guides(fill = guide_legend(reverse=TRUE))
zp1 <- zp1 + theme(axis.text.x = element_text(angle = 30, hjust = 1))
#print(zp1)

ggplotly(zp1, tooltip = c("text_label"))%>% layout(xaxis= list(showticklabels = T),height=600, width=800)

Figure 1: Selected Model

Time for this code chunk to run: 0 minutes

Show code
#In this case, residuals are actual cell counts vs. expected cell counts. 
bvr_LCA_best_model_mod<-bvr(LCA_best_model_mod)

#conditional probabilities
#Pr(B1=1|Class 3)
posteriors <- data.frame(LCA_best_model_mod$posterior, predclass=LCA_best_model_mod$predclass) 

classification_table <- plyr::ddply(posteriors, "predclass", function(x) colSums(x[,1:length(LCA_best_model_ppio$P)]))
clasification_errors<- 1 - sum(diag(as.matrix(classification_table[,2:(length(LCA_best_model_ppio$P)+1)]))) / sum(classification_table[,2:(length(LCA_best_model_ppio$P)+1)]) 

warning(paste("Error de clasificación: ", round(clasification_errors,2)))

Warning: Error de clasificación: 0.07

Show code
entropy_alt <- function(p) sum(-p * log(p))
error_prior <- entropy_alt(LCA_best_model_mod$P) # Class proportions
error_post <- mean(apply(LCA_best_model_mod$posterior, 1, entropy_alt),na.rm=T)
R2_entropy_alt <- (error_prior - error_post) / error_prior
warning(paste("Entropía: ", round(R2_entropy_alt,2)))

Warning: Entropía: 0.53

Show code
#https://stackoverflow.com/questions/72783185/entropy-calculation-gives-nan-is-applying-na-omit-a-valid-tweak
entropy.safe <- function (p) {
  if (any(p > 1 | p < 0)) stop("probability must be between 0 and 1")
  log.p <- numeric(length(p))
  safe <- p != 0
  log.p[safe] <- log(p[safe])
  sum(-p * log.p)
}

error_prior2 <- entropy.safe(LCA_best_model_mod$P) # Class proportions
error_post2 <- mean(apply(LCA_best_model_mod$posterior, 1, entropy.safe),na.rm=T)
R2_entropy_safe <- (error_prior2 - error_post2) / error_prior2
warning(paste("Entropía segura: ", round(R2_entropy_safe,2)))

Warning: Entropía segura: 0.88

Show code
entropy.brutal <- function (p) {
  if (any(p > 1 | p < 0)) stop("probability must be between 0 and 1")
  log.p <- log(p)
  ## as same as sum(na.omit(-p * log.p))
  sum(-p * log.p, na.rm = TRUE)
}

error_prior3 <- entropy.brutal(LCA_best_model_mod$P) # Class proportions
error_post3 <- mean(apply(LCA_best_model_mod$posterior, 1, entropy.brutal),na.rm=T)
R2_entropy_brutal <- (error_prior3 - error_post3) / error_prior3
warning(paste("Entropía brutal: ", round(R2_entropy_brutal,2)))

Warning: Entropía brutal: 0.88

Show code
#https://gist.github.com/daob/c2b6d83815ddd57cde3cebfdc2c267b3
warning(paste("Entropía (solución Oberski): ", round(entropy.R2(LCA_best_model_mod),2)))

Warning: Entropía (solución Oberski): 0.88

Show code
#\#minimum average posterior robability of cluster membership (\>0.7), interpretability (classes are clearly distinguishable), and parsimony (each class has a sufficient sample size for further analysis; n≥50).

Time for this code chunk to run: 0 minutes


Ver si la exclusión de casos que no calzan en alguna clase tiene consecuencias.


Show code
#To evaluate whether the exclusion of cases would bias the LCA results, a sensitivity analysis was carried out. We conducted T-Tests and Wilcoxon–Mann–Whitney tests (for non-parametric data) to compare included and excluded records in terms of demographic and clinical background characteristics and baseline pain scores (all 638 patients completed pain intensity, frequency and impairment scales).
tidy(LCA_best_model_mod) %>% 
  # dplyr::mutate(variable= dplyr::case_when(variable=="naq1"~"naq01",
  #                              variable=="naq2"~"naq02",
  #                              variable=="naq3"~"naq03",
  #                              variable=="naq4"~"naq04",
  #                              variable=="naq5"~"naq05",
  #                              variable=="naq6"~"naq06",
  #                              variable=="naq7"~"naq07",
  #                              variable=="naq8"~"naq08",
  #                              variable=="naq9"~"naq09",
  #                              TRUE~variable)) %>% 
ggplot(aes(outcome, estimate, color = factor(class), group = class)) +
  geom_line() +
  facet_wrap(~variable, nrow = 4) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))+
  theme_bw()+
  theme(legend.position = "bottom")
Modelo seleccionado

Figure 2: Modelo seleccionado

Time for this code chunk to run: 0 minutes

Show code
#########################################################
#########################################################
### Posterior probability calculation                 ###
### Assign class based on maximum probability         ###
###   Note: additional prep for Table1 package        ###
###         1) Convert all categorical variables to   ###
###            factors                                ###
###         2) Continuous variables as numeric        ###
###         3) Pull out number from census strings    ###
#########################################################
#########################################################

Time for this code chunk to run: 0 minutes

Show code
save.image("data2_lca3.RData")

Time for this code chunk to run: 0 minutes

Show code
require(tidyverse)
sesion_info <- devtools::session_info()
dplyr::select(
  tibble::as_tibble(sesion_info$packages),
  c(package, loadedversion, source)
) %>% 
  DT::datatable(filter = 'top', colnames = c('Row number' =1,'Variable' = 2, 'Percentage'= 3),
              caption = htmltools::tags$caption(
        style = 'caption-side: top; text-align: left;',
        '', htmltools::em('Packages')),
      options=list(
initComplete = htmlwidgets::JS(
        "function(settings, json) {",
        "$(this.api().tables().body()).css({
            'font-family': 'Helvetica Neue',
            'font-size': '50%', 
            'code-inline-font-size': '15%', 
            'white-space': 'nowrap',
            'line-height': '0.75em',
            'min-height': '0.5em'
            });",#;
        "}")))

Time for this code chunk to run: 0 minutes